import pandas as pd
import datetime as dt
import numpy as np
from pandas_profiling import ProfileReport
import cufflinks as cf
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected= True)
cf.go_offline()
data = pd.read_csv('Datasets/kc_house_data.csv')
display(data.sample(5))
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14418 | 7215730120 | 20150504T000000 | 606500.0 | 3 | 2.50 | 2170 | 5500 | 2.0 | 0 | 0 | ... | 8 | 2170 | 0 | 2000 | 0 | 98075 | 47.5975 | -122.018 | 2170 | 5862 |
| 9764 | 7982600030 | 20140812T000000 | 219000.0 | 3 | 1.50 | 1200 | 12000 | 1.0 | 0 | 0 | ... | 7 | 1200 | 0 | 1986 | 0 | 98001 | 47.2680 | -122.245 | 1200 | 9405 |
| 1531 | 2864600105 | 20140624T000000 | 819000.0 | 3 | 3.50 | 2130 | 6150 | 2.0 | 0 | 2 | ... | 8 | 1530 | 600 | 1908 | 0 | 98199 | 47.6491 | -122.405 | 2040 | 5381 |
| 5629 | 3541600235 | 20141028T000000 | 350000.0 | 3 | 1.75 | 1900 | 10225 | 1.0 | 0 | 0 | ... | 8 | 1220 | 680 | 1963 | 0 | 98166 | 47.4781 | -122.357 | 1850 | 12630 |
| 4825 | 2741100280 | 20140513T000000 | 415000.0 | 3 | 1.75 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 6 | 980 | 980 | 1911 | 0 | 98108 | 47.5576 | -122.317 | 1790 | 5000 |
5 rows × 21 columns
dimensionData = data.shape
print(f'This dataset has {dimensionData[0]} properties and {dimensionData[1]} available attributes.')
This dataset has 21613 properties and 21 available attributes.
list(data.columns)
['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
data.info()
#This dataset has no null values
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null float64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(5), int64(15), object(1) memory usage: 3.5+ MB
#Note 1: What is the relevance of some columns in this analysis?
#Columns to watch: 'view', 'grade', 'sqft_living15', 'sqft_lot15'
#Identify, transform or delete;
#Note 2: Transform the attribute type 'date' and verify its compliance;
#Note 3: It's necessary transforming 'condition' and 'waterview' in object type for response business questions
#view
data['view'].unique()
#Analyze: Scale how good the view is.
#Decision: Transform column type to object with classifications
# 0 and 1 - bad
# 2 - regular
# 3 and 4 - good
array([0, 3, 4, 2, 1])
data['view'] = data['view'].apply(lambda x: 'bad' if x < 2 else 'regular' if x == 2 else 'good')
data['view'].unique()
array(['bad', 'good', 'regular'], dtype=object)
#grade
data['grade'].unique()
#It's an index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
#Defcision - Change the attribute 'grade' to low_quality_design, average_quality_design and high_quality_design
array([ 7, 6, 8, 11, 9, 5, 10, 12, 4, 3, 13, 1])
data['grade'] = data['grade'].apply (lambda x: 'high_quality_design' if x >= 11 else
'avg_quality_design' if (x > 3) & (x <= 10) else
'low_quality_design')
data['grade'].unique()
array(['avg_quality_design', 'high_quality_design', 'low_quality_design'],
dtype=object)
#Deleting unnecessary columns
#sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors; unnecessary
#sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors; unnecessary
data = data.drop(['sqft_living15' , 'sqft_lot15'], axis= 1)
print(f'Now, this dataset has {data.shape[1]} attributes')
Now, this dataset has 19 attributes
#Checking Duplicates values
display(data[data['id'].duplicated(keep= False)])
display(data[data[['id', 'price']].duplicated(keep= False)])
#There are two types of duplicate data:
#os with same id but different prices(updated);
#and those with the same id and the same price;
#The most outdated types in relation to the date will be eliminated
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 93 | 6021501535 | 20140725T000000 | 430000.0 | 3 | 1.50 | 1580 | 5000 | 1.0 | 0 | bad | 3 | avg_quality_design | 1290 | 290 | 1939 | 0 | 98117 | 47.6870 | -122.386 |
| 94 | 6021501535 | 20141223T000000 | 700000.0 | 3 | 1.50 | 1580 | 5000 | 1.0 | 0 | bad | 3 | avg_quality_design | 1290 | 290 | 1939 | 0 | 98117 | 47.6870 | -122.386 |
| 313 | 4139480200 | 20140618T000000 | 1384000.0 | 4 | 3.25 | 4290 | 12103 | 1.0 | 0 | good | 3 | high_quality_design | 2690 | 1600 | 1997 | 0 | 98006 | 47.5503 | -122.102 |
| 314 | 4139480200 | 20141209T000000 | 1400000.0 | 4 | 3.25 | 4290 | 12103 | 1.0 | 0 | good | 3 | high_quality_design | 2690 | 1600 | 1997 | 0 | 98006 | 47.5503 | -122.102 |
| 324 | 7520000520 | 20140905T000000 | 232000.0 | 2 | 1.00 | 1240 | 12092 | 1.0 | 0 | bad | 3 | avg_quality_design | 960 | 280 | 1922 | 1984 | 98146 | 47.4957 | -122.352 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20670 | 8564860270 | 20150330T000000 | 502000.0 | 4 | 2.50 | 2680 | 5539 | 2.0 | 0 | bad | 3 | avg_quality_design | 2680 | 0 | 2013 | 0 | 98045 | 47.4759 | -121.734 |
| 20779 | 6300000226 | 20140626T000000 | 240000.0 | 4 | 1.00 | 1200 | 2171 | 1.5 | 0 | bad | 3 | avg_quality_design | 1200 | 0 | 1933 | 0 | 98133 | 47.7076 | -122.342 |
| 20780 | 6300000226 | 20150504T000000 | 380000.0 | 4 | 1.00 | 1200 | 2171 | 1.5 | 0 | bad | 3 | avg_quality_design | 1200 | 0 | 1933 | 0 | 98133 | 47.7076 | -122.342 |
| 21580 | 7853420110 | 20141003T000000 | 594866.0 | 3 | 3.00 | 2780 | 6000 | 2.0 | 0 | bad | 3 | avg_quality_design | 2780 | 0 | 2013 | 0 | 98065 | 47.5184 | -121.886 |
| 21581 | 7853420110 | 20150504T000000 | 625000.0 | 3 | 3.00 | 2780 | 6000 | 2.0 | 0 | bad | 3 | avg_quality_design | 2780 | 0 | 2013 | 0 | 98065 | 47.5184 | -121.886 |
353 rows × 19 columns
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3950 | 1825069031 | 20140814T000000 | 550000.0 | 4 | 1.75 | 2410 | 8447 | 2.0 | 0 | good | 4 | avg_quality_design | 2060 | 350 | 1936 | 1980 | 98074 | 47.6499 | -122.088 |
| 3951 | 1825069031 | 20141016T000000 | 550000.0 | 4 | 1.75 | 2410 | 8447 | 2.0 | 0 | good | 4 | avg_quality_design | 2060 | 350 | 1936 | 1980 | 98074 | 47.6499 | -122.088 |
| 14982 | 6308000010 | 20141208T000000 | 585000.0 | 3 | 2.50 | 2290 | 5089 | 2.0 | 0 | bad | 3 | avg_quality_design | 2290 | 0 | 2001 | 0 | 98006 | 47.5443 | -122.172 |
| 14983 | 6308000010 | 20150423T000000 | 585000.0 | 3 | 2.50 | 2290 | 5089 | 2.0 | 0 | bad | 3 | avg_quality_design | 2290 | 0 | 2001 | 0 | 98006 | 47.5443 | -122.172 |
| 20053 | 8648900110 | 20140505T000000 | 555000.0 | 3 | 2.50 | 1940 | 3211 | 2.0 | 0 | bad | 3 | avg_quality_design | 1940 | 0 | 2009 | 0 | 98027 | 47.5644 | -122.093 |
| 20054 | 8648900110 | 20140826T000000 | 555000.0 | 3 | 2.50 | 1940 | 3211 | 2.0 | 0 | bad | 3 | avg_quality_design | 1940 | 0 | 2009 | 0 | 98027 | 47.5644 | -122.093 |
#The rows with duplicate values that have the parameter 'keep' equal to 'first' are the most up-to-date data
#So these rows will remain.
#Deleting duplicates values
data = data.drop_duplicates(subset= ['id', 'price'], keep= 'first')
data = data.drop_duplicates(subset= ['id'], keep= 'first')
display(data[data[['id', 'price']].duplicated(keep= False)])
display(data[data['id'].duplicated(keep= False)])
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long |
|---|
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long |
|---|
#Reseting Index
data.reset_index(drop= True, inplace= True)
print(f'Now, this dataset has {data.shape[0]} registered properties')
Now, this dataset has 21436 registered properties
#Transforming date format for datetime
data['date'] = pd.to_datetime(data['date'], format= '%Y-%m-%d', errors= 'ignore')
#Separing and transforming date format to int
data['year'] = data['date'].dt.strftime('%Y')
data['year'] = data['year'].astype(int)
data['month'] = data['date'].dt.strftime('%m')
data['month'] = data['month'].astype(int)
data['days'] = data['date'].dt.strftime('%d')
data['days'] = data['days'].astype(int)
print(f'Now, this dataset has {data.shape[1]} attributes')
Now, this dataset has 22 attributes
data.columns.tolist()
['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'year', 'month', 'days']
#Checking attribute compliance
describeDate = data[['year', 'month', 'days']].describe()
display(describeDate)
print('> YEAR')
print('The maximum year of property registration is {}'.format(data['year'].max()))
print('The minimum year of property registration is {}'.format(data['year'].min()))
print('> MONTH')
print('The maximum month of property registration is {}'.format(data['month'].max()))
print('The minimum month of property registration is {}'.format(data['month'].min()))
print('> DAYS')
print('The maximum days of property registration is {}'.format(data['days'].max()))
print('The minimum days of property registration is {}'.format(data['days'].min()))
| year | month | days | |
|---|---|---|---|
| count | 21436.000000 | 21436.000000 | 21436.000000 |
| mean | 2014.318949 | 6.590782 | 15.683476 |
| std | 0.466080 | 3.108176 | 8.636763 |
| min | 2014.000000 | 1.000000 | 1.000000 |
| 25% | 2014.000000 | 4.000000 | 8.000000 |
| 50% | 2014.000000 | 6.000000 | 16.000000 |
| 75% | 2015.000000 | 9.000000 | 23.000000 |
| max | 2015.000000 | 12.000000 | 31.000000 |
> YEAR The maximum year of property registration is 2015 The minimum year of property registration is 2014 > MONTH The maximum month of property registration is 12 The minimum month of property registration is 1 > DAYS The maximum days of property registration is 31 The minimum days of property registration is 1
#The date conforms to reality.
#Turning numerics atrribute into object attributes
#condition
# - If the value of the column "condition" is less than or equal to 2 > 'bad'
# - If the value of the column "condition" is equal to 3 or 4 > 'regular'
# - If the value of the column "condition" is equal to 5 > 'good'
data.loc[data['condition'] <= 2, 'condition'] = 'bad'
data.loc[(data['condition'] == 3) | (data['condition'] == 4), 'condition'] = 'regular'
data.loc[data['condition'] == 5, 'condition'] = 'good'
display(data['condition'].unique())
array(['regular', 'good', 'bad'], dtype=object)
#waterfront
display(data['waterfront'].unique())
#0 - No
#1- Yes
data.loc[data['waterfront'] == 0, 'waterfront'] = 'No'
data.loc[data['waterfront'] == 1, 'waterfront'] = 'Yes'
display(data['waterfront'].unique())
array([0, 1])
array(['No', 'Yes'], dtype=object)
data.dtypes
id int64 date datetime64[ns] price float64 bedrooms int64 bathrooms float64 sqft_living int64 sqft_lot int64 floors float64 waterfront object view object condition object grade object sqft_above int64 sqft_basement int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 year int64 month int64 days int64 dtype: object
numericTypes = data.select_dtypes(include= 'number')
numericTypes = numericTypes.drop(['year' ,'month', 'days', 'lat', 'long'], axis= 1)
#Exploratory Analysis with pandas_profiling
profile = ProfileReport(numericTypes)
profile.to_notebook_iframe()